require(quanteda)
require(stm)
require(mvtnorm)

#### covert the data to stm format ####
load("dfm_matrix.Rdata")

stm.data <- convert(dfm.matrix, to = "stm")
stm.data$meta$year <- factor(stm.data$meta$year)
stm.data$meta$party <- factor(stm.data$meta$party,
                              levels = c("others", "LDP", "JSP", "Komeito", "DSP", "JCP", "NFP", "DPJ", "left", "right"))
stm.data$meta$party.year <- paste(stm.data$meta$party, 
                                  stm.data$meta$year, sep = ".")

#### year-specific coefficient model ####
set.seed(12345)
STM.result.model2 <- stm(documents = stm.data$documents, 
                         vocab = stm.data$vocab, K = 75, 
                         prevalence = ~ female * year + party.year + age + I(age ^ 2) + incumbent + wins, 
                         data = stm.data$meta)
save(STM.result.model2, file = "STM_result_model2.Rdata")

#### compute the first-differences (FDs) (main text) ####
## conduct post-estimation simulation
set.seed(12345)
post.sim.model2 <- estimateEffect(1:75 ~ female * year + party.year + age + I(age ^ 2) + incumbent + wins, 
                                  STM.result.model2, stm.data$meta, nsim = 100)

## function implementing the procedure explained in Online Appendix A.3
predict.fun.model2 <- function(result, year.val, sim.beta) {
  cdata <- subset(result$data, year == year.val)
  cdata[, "year"] <- year.val
  cdata[, "female"] <- 0
  design.matrix <- makeDesignMatrix(result$formula, result$data, cdata, sparse = FALSE)
  male.prediction <- tcrossprod(design.matrix, sim.beta)
  male.sim.result <- colMeans(male.prediction)
  male.sim.mean <- mean(male.sim.result)
  male.sim.CI <- quantile(male.sim.result, c(0.025, 0.975))
  cdata[, "female"] <- 1
  design.matrix <- makeDesignMatrix(result$formula, result$data, cdata, sparse = FALSE)
  female.prediction <- tcrossprod(design.matrix, sim.beta)
  female.sim.result <- colMeans(female.prediction)
  female.sim.mean <- mean(female.sim.result)
  female.sim.CI <- quantile(female.sim.result, c(0.025, 0.975))
  dif.sim.result <- colMeans(female.prediction - male.prediction)
  dif.sim.mean <- mean(dif.sim.result)
  dif.sim.CI <- quantile(dif.sim.result, c(0.025, 0.975))
  return(list(male.sim.result = c(male.sim.mean, male.sim.CI),
              female.sim.result = c(female.sim.mean, female.sim.CI),
              dif.sim.result = c(dif.sim.mean, dif.sim.CI), 
              dif.sample = colMeans(female.prediction - male.prediction)))
}

## compute the FDs between women and men varying by year
year.seq <- unique(docvars(dfm.matrix, "year"))
predict.male <- predict.female <- predict.dif <- array(NA, c(75, 8, 3))
dif.matrix <- array(NA, c(75, 8, 10000))
for (i in 1:75) {
  set.seed(12345)
  sim.beta <- do.call(rbind, lapply(post.sim.model2$parameters[[i]], function(x) rmvnorm(100, x$est, x$vcov)))
  for (j in 1:8) {
    predict.stm.result <- predict.fun.model2(post.sim.model2, as.character(year.seq[j]), sim.beta)
    predict.male[i, j, ] <- predict.stm.result$male.sim.result
    predict.female[i, j, ] <- predict.stm.result$female.sim.result
    predict.dif[i, j, ] <- predict.stm.result$dif.sim.result
    dif.matrix[i, j, ] <- predict.stm.result$dif.sample
  }
}

## mean absolute differences (MD)
MD.year <- matrix(NA, 8, 3)
for (i in 1:8) {
  MD <- colMeans(abs(dif.matrix[, i, ]))
  MD.year[i, 1] <- mean(MD)
  MD.year[i, 2] <- quantile(MD, probs = 0.025)
  MD.year[i, 3] <- quantile(MD, probs = 0.975)
}

# difference in the MD between before and after the reform
before.after.dif <- colMeans(colMeans(abs(dif.matrix[, 4:8, ]))) - 
  colMeans(colMeans(abs(dif.matrix[, 1:3, ])))
round(mean(before.after.dif), 4)
round(quantile(before.after.dif, probs = c(0.025, 0.975)), 4)

# Figure 2
cairo_pdf("Figure_2.pdf", 
          width = 4, height = 3, pointsize = 8, family = "Helvetica")
par(mar = c(4, 4, 1, 1), lwd = 0.5)
plot(NULL, NULL, type = "n", bty = "L", 
     xlim = c(1985, 2010), ylim = c(0, 0.022), 
     main = "", xlab = "", ylab = "", xaxt = "n", yaxt = "n")
segments(1994, 0, 1994, 0.018, lty = 3)
text(1994, 0.018, "Electoral\nreform", pos = 3)
points(year.seq, MD.year[, 1], pch = 19)
segments(year.seq, MD.year[, 2], year.seq, MD.year[, 3])
axis(1, at = year.seq, labels = FALSE, lwd = 0.5)
mtext(c("86", "90", "93", "96", "00", "03", "05", "09"), 
      side = 1, line = 1, at = year.seq)
mtext("Election year", side = 1, line = 2.5)
axis(2, lwd = 0.5)
mtext("Mean absolute gender difference", side = 2, line = 2.5)
dev.off()

## trend in gender differences in the prevalence of the five distinctive topics
distinctive.topic.id <- c(42, 47, 58, 69, 46)  # see 1_main_analysis_model1.R
distinctive.topic.labels <- c("Policy support for mothers", "Gender equality",             
                              "Child care", "Pacifism", 
                              "Agricultural administration")

# Figure 3
cairo_pdf("Figure_3.pdf", 
          width = 6, height = 4, pointsize = 8, family = "Helvetica")
layout(matrix(1:6, 2, 3, byrow = TRUE))
par(mar = c(4, 4, 2.5, 1.5), lwd = 0.5)
for (i in 1:5) {
  plot(NULL, NULL, type = "n", bty = "L", 
       xlim = c(1985, 2010), ylim = c(-0.03, 0.08), 
       main = distinctive.topic.labels[i], 
       xlab = "", ylab = "", xaxt = "n", yaxt = "n")
  segments(1994, -0.03, 1994, 0.065, lty = 3)
  text(1994, 0.065, "Electoral\nreform", pos = 3)
  abline(h = 0, lty = 3, col = "gray50")
  segments(year.seq - 0.5, predict.female[distinctive.topic.id[i], , 2], 
           year.seq - 0.5, predict.female[distinctive.topic.id[i], , 3], col = "gray50")
  points(year.seq - 0.5, predict.female[distinctive.topic.id[i], , 1], pch = 19, col = "gray50")
  segments(year.seq, predict.male[distinctive.topic.id[i], , 2], 
           year.seq, predict.male[distinctive.topic.id[i], , 3], col = "gray50")
  points(year.seq, predict.male[distinctive.topic.id[i], , 1], pch = 21, bg = "white", col = "gray50")
  segments(year.seq + 0.5, predict.dif[distinctive.topic.id[i], , 2], 
           year.seq + 0.5, predict.dif[distinctive.topic.id[i], , 3])
  points(year.seq + 0.5, predict.dif[distinctive.topic.id[i], , 1], pch = 4)
  axis(1, at = year.seq, labels = FALSE, lwd = 0.5)
  mtext(c("86", "90", "93", "96", "00", "03", "05", "09"), side = 1, line = 1, at = year.seq, cex = 0.7)
  mtext("Election year", side = 1, line = 2.5, cex = 0.8)
  axis(2, lwd = 0.5)
  mtext("Percent/Percentage points", side = 2, line = 2.5, cex = 0.8)
}
dev.off()